load data wrangling package
library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.2 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
load data
dat_raw <- read_csv("data.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## EZ = col_double(),
## PLZ = col_double(),
## ON = col_double(),
## GstFl = col_double(),
## Gebäudehöhe = col_double(),
## Geschoße = col_double(),
## Zähler = col_double(),
## Nenner = col_double(),
## BJ = col_double(),
## Kaufpreis = col_double(),
## mGfl = col_double(),
## percentWidmung = col_double(),
## aufEZ = col_double(),
## one_more_time = col_logical(),
## lon = col_double(),
## lat = col_double(),
## price_m2 = col_double(),
## price_log = col_double(),
## train = col_double(),
## cluster = col_double()
## )
## See spec(...) for full column specifications.
take a look at data
(Note: data is from Vienna’s land charge register to which I have added price clusters)
dat_raw
quick visualization
## get map
library(ggmap)
vienna <- make_bbox(lon, lat, dat_raw) %>%
get_stamenmap(zoom = 13, maptype = c("toner-lite"))
ggmap(vienna, extent = "device") +
coord_cartesian() +
geom_point(data = dat_raw %>%
filter(train==1),
aes(x=lon, y=lat, color = as.factor(cluster)),
alpha=.5) +
geom_point(data = dat_raw %>%
filter(train==0),
aes(x=lon, y=lat),
alpha=.5, shape=4) +
theme(legend.position="none")
add id variable (= row number to data to identify individual cases)
dat_id <- dat_raw %>%
mutate(id = row_number())
select columns we will work with (id, lon, lat, cluster)
dat_select <- dat_id %>%
select(id, lon, lat, cluster)
separate training and test data
## test data has no assigned 'cluster'
dat_test <- dat_select %>%
filter(is.na(cluster)) %>%
## remove cluster, because it is missing anyway
select(-cluster)
## get train data by anti join with original data
dat_train <- dat_select %>%
anti_join(dat_test, by="id")
cross training with test data (combine each case in training set with each case in the test set)
dat_expand <- dat_train %>%
crossing(dat_test)
take a quick glance
dat_expand
calculate euclidean distance (= square root of the sum of distance squares)
dat_dist <- dat_expand %>%
mutate(dist = sqrt((lat-lat1)^2 + (lon-lon1)^2))
get the k (=5) nearest cases per test set case
dat_5nn <- dat_dist %>%
## segregate by id1
group_by(id1) %>%
## get the 5 cases with smallest dist
top_n(-5, dist) %>%
## sort by id1
arrange(id1)
get mayority vote
dat_classified <- dat_5nn %>%
## segregate by id1 and cluster
group_by(id1, cluster) %>%
## get counts per resulting group
count() %>%
## reduce to highest count
top_n(1, n) %>%
## remove 'n'
select(-n)
add result to test data
dat_final <- dat_test %>%
full_join(dat_classified, by=c("id"="id1"))
visually check result
ggmap(vienna, extent = "device") +
coord_cartesian() +
geom_point(data = bind_rows(dat_train, dat_final),
aes(x=lon, y=lat, color = as.factor(cluster)),
alpha=.5) +
theme(legend.position="none")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
for reproducability
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] de_AT.UTF-8/de_AT.UTF-8/de_AT.UTF-8/C/de_AT.UTF-8/de_AT.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggmap_3.0.0 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.1
## [5] purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.2
## [9] ggplot2_3.1.1 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.7 haven_2.1.0
## [4] lattice_0.20-38 colorspace_1.4-1 generics_0.0.2
## [7] htmltools_0.3.6 yaml_2.2.0 rlang_0.3.4
## [10] pillar_1.4.1 glue_1.3.1 withr_2.1.2
## [13] modelr_0.1.4 readxl_1.3.1 jpeg_0.1-8
## [16] plyr_1.8.4 munsell_0.5.0 gtable_0.3.0
## [19] cellranger_1.1.0 rvest_0.3.4 RgoogleMaps_1.4.3
## [22] evaluate_0.14 labeling_0.3 knitr_1.23
## [25] curl_3.3 broom_0.5.2 Rcpp_1.0.1
## [28] scales_1.0.0 backports_1.1.4 jsonlite_1.6
## [31] rjson_0.2.20 hms_0.4.2 png_0.1-7
## [34] digest_0.6.19 stringi_1.4.3 grid_3.6.0
## [37] cli_1.1.0 tools_3.6.0 bitops_1.0-6
## [40] magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4
## [43] pkgconfig_2.0.2 xml2_1.2.0 lubridate_1.7.4
## [46] assertthat_0.2.1 rmarkdown_1.13 httr_1.4.0
## [49] rstudioapi_0.10 R6_2.4.0 nlme_3.1-139
## [52] compiler_3.6.0